{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4e7930ab",
   "metadata": {},
   "source": [
    "Counts of genres from both full-genre labeled sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "917294ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "TT_set = pd.read_csv('tiktok_grouplabels_revised.csv')\n",
    "YT_set = pd.read_csv('YT_grouplabels_revised_final.csv')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c1783b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "rotten_creators = [\"Noah Schnapp\", \"Oneya D'Amelio\", \"Anokhina Liza\"]\n",
    "#creators with too many dead links"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bb44f81",
   "metadata": {},
   "outputs": [],
   "source": [
    "TT_set = TT_set[~TT_set['user'].isin(rotten_creators)].copy()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "67d607f7",
   "metadata": {},
   "outputs": [],
   "source": [
    "TT_set.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d4310ba",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove dead link rows\n",
    "TT_set = TT_set[TT_set['Dead link'].astype(str).str.strip() != 'x'].copy()\n",
    "\n",
    "print(\"TT_set - Final number of labeled videos after removing dead links:\", len(TT_set))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "89a75ce1",
   "metadata": {},
   "outputs": [],
   "source": [
    "columns_to_keep = [17, 18, 24, 26, 29, 30, 34, 36, 39, 40, 41, 42]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ceb4959",
   "metadata": {},
   "outputs": [],
   "source": [
    "#examine target columns\n",
    "all_columns = TT_set.columns\n",
    "=\n",
    "columns_to_keep_names = [all_columns[i] for i in columns_to_keep]\n",
    "\n",
    "final_columns = list(all_columns[:17]) + columns_to_keep_names\n",
    "\n",
    "TT_set = TT_set[final_columns].copy()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5eb3dc5e",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "print(\"Columns at index 17–28:\")\n",
    "print(TT_set.columns[17:29].tolist())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "81ac601e",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "total_videos = len(TT_set)\n",
    "\n",
    "genre_columns = [\n",
    "    'Prank',\n",
    "    'HPV (hypnotic process video)',\n",
    "    'Media Commentary/Reaction',\n",
    "    'Human Experiment',\n",
    "    'CPB Common Person Depiction/Experience',\n",
    "    'Music/Dance/Acting',\n",
    "    'Vlog',\n",
    "    'Skit',\n",
    "    'Stunts/Sports',\n",
    "    'Facial Animation/Cosplay',\n",
    "    'Advertisement (Overt)',\n",
    "    'Other'\n",
    "]\n",
    "\n",
    "# Count and percent for each genre\n",
    "genre_percentages = {}\n",
    "\n",
    "for genre in genre_columns:\n",
    "    count = (TT_set[genre] == 1).sum()\n",
    "    percent = count / total_videos * 100\n",
    "    genre_percentages[genre] = (count, round(percent, 2))\n",
    "\n",
    "for genre, (count, percent) in genre_percentages.items():\n",
    "    print(f\"{genre}: {count} videos ({percent}%)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2180fb43",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "#bootstrap and cis\n",
    "grouped_tt = TT_set.groupby('user')\n",
    "users = list(grouped_tt.groups.keys())\n",
    "user_dict = {user: grouped_tt.get_group(user) for user in users}\n",
    "\n",
    "n_iterations = 10000\n",
    "results = {}\n",
    "\n",
    "for genre in genre_columns:\n",
    "    bootstrap_dist = []\n",
    "\n",
    "    for _ in range(n_iterations):\n",
    "        sampled_users = np.random.choice(users, size=len(users), replace=True)\n",
    "        sampled_df = pd.concat([user_dict[user] for user in sampled_users])\n",
    "\n",
    "        count = (sampled_df[genre] == 1).sum()\n",
    "        percentage = count / len(sampled_df)\n",
    "        bootstrap_dist.append(percentage)\n",
    "\n",
    "    lower = np.percentile(bootstrap_dist, 2.5) * 100\n",
    "    upper = np.percentile(bootstrap_dist, 97.5) * 100\n",
    "    results[genre] = (round(lower, 2), round(upper, 2))\n",
    "    print(f\"{genre}: 95% CI = {round(lower, 2)}% to {round(upper, 2)}%\")\n",
    "\n",
    "genre_cis_df = pd.DataFrame.from_dict(results, orient='index', columns=['CI Lower', 'CI Upper'])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "73610640",
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "genre_labels = [\n",
    "    \"Pranks\", \"Mesmeric Process Videos\", \"Media Commentary/Reaction\",\n",
    "    \"Human Experiment\", \"Common Person/Experience Depiction\", \"Music/Dance/Acting\",\n",
    "    \"Vlog\", \"Skit\", \"Stunts/Sports\", \"Facial Animation/Cosplay\",\n",
    "    \"Advertisement (Overt)\", \"Other\"\n",
    "]\n",
    "\n",
    "percentages = [5.99, 14.88, 8.89, 5.08, 4.9, 35.21, 24.86, 12.34, 2.54, 5.81, 6.35, 2.0]\n",
    "colors = ['red'] + ['blue'] * (len(genre_labels) - 1)\n",
    "\n",
    "\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.bar(genre_labels, percentages, color=colors)\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "plt.title(\"TikTok Set 2 Genre Category Percentages\")\n",
    "plt.ylabel(\"Percentage of (Live) Videos\")\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "252e0730",
   "metadata": {},
   "outputs": [],
   "source": [
    "#youtube"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1622d217",
   "metadata": {},
   "outputs": [],
   "source": [
    "YT_set.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "254545bb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the one dead link\n",
    "YT_set = YT_set[YT_set['Deadlink'].astype(str).str.strip() != 'x'].copy()\n",
    "\n",
    "print(\"YT_set - Final number of labeled videos:\", len(YT_set))  # Should print 489\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cf722893",
   "metadata": {},
   "outputs": [],
   "source": [
    "#target columns\n",
    "all_columns = YT_set.columns\n",
    "\n",
    "keep_after_13 = [14, 16, 22, 23, 26, 27, 30, 32, 33, 34, 38, 39, 40, 41, 42, 43, 44]\n",
    "\n",
    "columns_to_keep = list(all_columns[:14]) + [all_columns[i] for i in keep_after_13]\n",
    "\n",
    "YT_set = YT_set[columns_to_keep].copy()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f44be3f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "YT_set.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cf5f06b",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rename column\n",
    "YT_set.rename(columns={YT_set.columns[15]: 'HPV'}, inplace=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dc0bceb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"Columns from index 14 to 30:\")\n",
    "print(YT_set.columns[14:31].tolist())\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "57803198",
   "metadata": {},
   "outputs": [],
   "source": [
    "#counts and percents\n",
    "total_videos = len(YT_set)\n",
    "\n",
    "genre_columns = [\n",
    "    'Prank ', 'HPV', 'Media Commentary or Reaction', 'Human Experiment',\n",
    "    'Common Person/Experience Depiction', 'Gaming', 'Vlog', 'Video Essay',\n",
    "    'How To', 'Skit', 'Trailer/Teaser', 'Movie Clip/TV Clip',\n",
    "    'Music Video/performance', 'Parody/comedy', 'Stunts/Sports',\n",
    "    'Ads', 'Other '\n",
    "]\n",
    "\n",
    "genre_columns = [col.strip() for col in genre_columns]\n",
    "\n",
    "YT_set.columns = [col.strip() for col in YT_set.columns]\n",
    "\n",
    "genre_counts = {}\n",
    "for genre in genre_columns:\n",
    "    count = (YT_set[genre] == 1).sum()\n",
    "    percent = round((count / total_videos) * 100, 2)\n",
    "    genre_counts[genre] = (count, percent)\n",
    "\n",
    "for genre, (count, percent) in genre_counts.items():\n",
    "    print(f\"{genre}: {count} videos ({percent}%)\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1e366e33",
   "metadata": {},
   "outputs": [],
   "source": [
    "selected_genres = [\n",
    "    'Prank',\n",
    "    'HPV',\n",
    "    'Media Commentary or Reaction',\n",
    "    'Human Experiment',\n",
    "    'Common Person/Experience Depiction',\n",
    "    'Gaming',\n",
    "    'Vlog',\n",
    "    'Video Essay',\n",
    "    'How To',\n",
    "    'Skit',\n",
    "    'Stunts/Sports',\n",
    "    'Ads',\n",
    "    'Other'\n",
    "]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "11cc6ba4",
   "metadata": {},
   "outputs": [],
   "source": [
    "#bootstrap and cis\n",
    "\n",
    "grouped_yt = YT_set.groupby('channel_title')\n",
    "channels = list(grouped_yt.groups.keys())\n",
    "channel_dict = {channel: grouped_yt.get_group(channel) for channel in channels}\n",
    "\n",
    "n_iterations = 10000\n",
    "genre_cis = {}\n",
    "\n",
    "for genre in selected_genres:\n",
    "    bootstrap_dist = []\n",
    "\n",
    "    for _ in range(n_iterations):\n",
    "        sampled_channels = np.random.choice(channels, size=len(channels), replace=True)\n",
    "        sampled_df = pd.concat([channel_dict[channel] for channel in sampled_channels])\n",
    "        \n",
    "        count = (sampled_df[genre] == 1).sum()\n",
    "        proportion = count / len(sampled_df)\n",
    "        bootstrap_dist.append(proportion)\n",
    "\n",
    "    lower = np.percentile(bootstrap_dist, 2.5) * 100\n",
    "    upper = np.percentile(bootstrap_dist, 97.5) * 100\n",
    "    genre_cis[genre] = (round(lower, 2), round(upper, 2))\n",
    "\n",
    "    print(f\"{genre}: 95% CI = {round(lower, 2)}% to {round(upper, 2)}%\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f8679d37",
   "metadata": {},
   "outputs": [],
   "source": [
    "#plot\n",
    "\n",
    "genre_labels = [\n",
    "    \"Prank\", \"Mesmeric Process Videos\", \"Media Commentary/Reaction\", \"Human Experiment\",\n",
    "    \"Common Person/Experience Depiction\", \"Gaming\", \"Vlog\", \"Video Essay\",\n",
    "    \"How To\", \"Skit\", \"Stunts/Sports\", \"Advertisement (Overt)\", \"Other\"\n",
    "]\n",
    "\n",
    "percentages = [2.86, 10.63, 8.59, 8.79, 2.66, 30.27, 14.31, 3.48, 4.91, 19.63, 1.23, 1.23, 0.0]\n",
    "colors = ['red'] + ['blue'] * (len(genre_labels) - 1)\n",
    "\n",
    "# Plot\n",
    "plt.figure(figsize=(12, 6))\n",
    "plt.bar(genre_labels, percentages, color=colors)\n",
    "plt.xticks(rotation=45, ha='right')\n",
    "plt.title(\"YouTube Set 2 Genre Category Percentages\")\n",
    "plt.ylabel(\"Percentage of (Live) Videos\")\n",
    "plt.tight_layout()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "222d2213",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
